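# Risk-sensitive dynamic programming (value iteration).
# Given: the number of states and actions, the transition dynamics and the
# rewards. A policy table pi is stored as well, but value iteration does not read it.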
import numpy as np

class riskSensitiveDP:
    """Value iteration with a log-expected-exponential backup.

    Each sweep applies, for every state ``s`` and action ``a``::

        Q[s, a] = gamma * log(sum_s' dynamics[s, a, s'] * exp(V[s']))
                  + r[s, a] / beta
        V[s]    = max_a Q[s, a]

    Attributes:
        num_states: Number of states.
        num_actions: Number of actions.
        beta: Divisor applied to the rewards.
        V: State values, shape ``(num_states,)``; starts at zero.
        Q: State-action values, shape ``(num_states, num_actions)``.
        pi: Zero-initialised ``(num_states, num_actions)`` table. It is
            stored for callers; ``value_iteration`` does not modify it.
        r: Private float copy of the rewards, indexed ``r[s, a]``.
        dynamics: Private float copy of the transition weights, indexed
            ``dynamics[s, a, next_s]``.
    """

    def __init__(self, num_states, num_actions, dynamics, r, beta=1):
        """Store the problem definition and zero-initialise V, Q and pi.

        Args:
            num_states: Number of states.
            num_actions: Number of actions.
            dynamics: Array-like indexed ``[s, a, next_s]``; copied.
            r: Array-like indexed ``[s, a]``; copied.
            beta: Divisor applied to ``r`` in every backup.
        """
        self.num_states = num_states
        self.num_actions = num_actions
        self.beta = beta
        self.V = np.zeros(num_states)
        self.Q = np.zeros((num_states, num_actions))
        self.pi = np.zeros((num_states, num_actions))
        # np.array copies by default, so later changes to the caller's arrays
        # do not leak into the solver; it also accepts plain nested lists.
        self.r = np.array(r, dtype=float)
        self.dynamics = np.array(dynamics, dtype=float)

    def value_iteration(self, iterations=1000, gamma=0.9, terminal_state=9,
                        log_every=1000):
        """Run synchronous value-iteration sweeps and return the greedy policy.

        Args:
            iterations: Number of sweeps to perform.
            gamma: Factor multiplying the log-expectation term.
            terminal_state: Index of a state whose Q-values are pinned to
                ``r[s, :] / beta`` (no look-ahead). ``None`` or an index
                outside ``[0, num_states)`` disables the special case. The
                default of 9 is kept for backward compatibility.
            log_every: Print the sweep index whenever it is a multiple of
                this value; pass ``0`` or ``None`` to silence the output.

        Returns:
            A tuple ``(policy, Q)``: ``policy[s]`` is ``argmax_a Q[s, a]`` and
            ``Q`` is ``self.Q`` (updated in place).
        """
        n_states, n_actions = self.num_states, self.num_actions
        # Only the leading (num_states, num_actions) part is ever consulted.
        dynamics = self.dynamics[:n_states, :n_actions]
        scaled_reward = self.r[:n_states, :n_actions] / self.beta
        has_terminal = (terminal_state is not None
                        and 0 <= terminal_state < n_states)
        # Transitions with zero weight contribute nothing to the expectation.
        support = dynamics != 0

        for sweep in range(iterations):
            if log_every and sweep % log_every == 0:
                print(sweep)

            # log(sum_s' p * exp(V)) evaluated with a log-sum-exp shift: a
            # plain exp(V) overflows to inf once any value exceeds ~709 and
            # would poison every later sweep. The shift is taken per (s, a)
            # over reachable next states only, so it never underflows the
            # dominant term of that particular sum.
            next_values = np.where(support, self.V, -np.inf)
            shift = next_values.max(axis=2, initial=-np.inf)
            # No reachable next state (or non-finite values): fall back to an
            # unshifted sum, which yields log(0) = -inf like a direct
            # evaluation would.
            shift = np.where(np.isfinite(shift), shift, 0.0)
            expectation = (
                dynamics * np.exp(next_values - shift[..., None])
            ).sum(axis=2)

            self.Q[...] = gamma * (shift + np.log(expectation)) + scaled_reward
            if has_terminal:
                self.Q[terminal_state] = scaled_reward[terminal_state]

            # Greedy improvement: the state value is the best action value.
            self.V = self.Q.max(axis=1)

        return np.argmax(self.Q, axis=1), self.Q


